Setup

##############################
###        SETUP           ###
##############################

# Loading required libraries
library(Seurat)
library(harmony)
library(scDblFinder)
library(ggplot2)
library(dplyr)
set.seed(1)

options(future.globals.maxSize = 3 * 1024^3)

# QC thresholds initial
mt_max <- 20
nfeature_min <- 200
nfeature_max <- 6000
ncount_min <- 500
ncount_max <- 40000
cluster_res <- 0.8

# S gene signatures from paper (sullivan et al. 2024)
m2ts_genes <- c("MRC1","MS4A4A","CD36","CCL13","CCL18","CCL23","SLC38A6","FGL2","FN1","MAF")
m1ts_genes <- c("CCR7","IL2RA","CXCL11","CCL19","CXCL10","PLA1A","PTX3")
ctlts_genes <- c("GZMA","GZMB","GZMH","GZMM","PRF1")

dir.create("data_processed", showWarnings = FALSE)
dir.create("tables", showWarnings = FALSE)
dir.create("figures", showWarnings = FALSE)

Data Download

Downloading the 10x matrices for GSE228598 from GEO. 10 of the 28 samples from the paper were publicly available.

##############################
###    DATA DOWNLOAD       ###
##############################

geo_dir <- "data/geo"
dir.create(geo_dir, recursive = TRUE, showWarnings = FALSE)

supp_url <- "https://www.ncbi.nlm.nih.gov/geo/download/?acc=GSE228598&format=file"
tar_file <- file.path(geo_dir, "GSE228598_RAW.tar")

# Only download if tar doesn't exist and no gz files present
gz_present <- length(list.files(geo_dir, pattern = "\\.gz$")) > 0
if (!file.exists(tar_file) && !gz_present) {
  download.file(supp_url, tar_file, mode = "wb")
}

# Only extract if no gz files present
if (!gz_present) {
  untar(tar_file, exdir = geo_dir)
}

Load and Merge

merge into one Seurat object.

##############################
###    LOAD & MERGE        ###
##############################

# Reorganizing 10x triplets into per-sample dirs for Read10X
gz_files <- list.files(geo_dir, pattern = "barcodes\\.tsv\\.gz$", full.names = TRUE)
sample_ids <- sub(".*_(P\\d+)_barcodes.*", "\\1", basename(gz_files))

obj_list <- lapply(seq_along(sample_ids), function(i) {
  sample_dir <- file.path(geo_dir, sample_ids[i])
  dir.create(sample_dir, showWarnings = FALSE)

  prefix <- sub("_barcodes\\.tsv\\.gz$", "_", basename(gz_files[i]))
  for (suffix in c("barcodes.tsv.gz", "features.tsv.gz", "matrix.mtx.gz")) {
    src <- file.path(geo_dir, paste0(prefix, suffix))
    dst <- file.path(sample_dir, suffix)
    if (!file.exists(dst)) file.copy(src, dst)
  }

  counts <- Read10X(sample_dir)
  obj <- CreateSeuratObject(counts, project = sample_ids[i], min.cells = 3, min.features = 200)
  obj$sample <- sample_ids[i]
  obj
})

names(obj_list) <- sample_ids
merged <- merge(obj_list[[1]], obj_list[-1], add.cell.ids = sample_ids)

rm(obj_list)
gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells  11429040  610.4   20222528 1080.0         NA  15987631  853.9
## Vcells 131491563 1003.3  342551921 2613.5      16384 308514726 2353.8

Quality Control

##############################
###    QUALITY CONTROL     ###
##############################

# Computing mito percentage
merged[["percent.mt"]] <- PercentageFeatureSet(merged, pattern = "^MT-")

# QC violin plots
VlnPlot(merged, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"),
        group.by = "sample", pt.size = 0, ncol = 3) &
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# QC scatter plots
p1 <- FeatureScatter(merged, feature1 = "nCount_RNA", feature2 = "nFeature_RNA",
                     group.by = "sample", pt.size = 0.1) + NoLegend()
p2 <- FeatureScatter(merged, feature1 = "nCount_RNA", feature2 = "percent.mt",
                     group.by = "sample", pt.size = 0.1) + NoLegend()
p1 + p2

nCount has extreme outliers that squash the violin. The scatter shows most cells cluster in a tight nCount/nFeature band with a few high-mito stragglers. Standard cutoffs below.

# Capped violin for UMI distribution
VlnPlot(merged, features = "nCount_RNA", group.by = "sample", pt.size = 0) +
  ylim(0, 50000) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Applying QC filters
merged <- subset(merged,
  nFeature_RNA > nfeature_min & nFeature_RNA < nfeature_max &
  nCount_RNA > ncount_min & nCount_RNA < ncount_max &
  percent.mt < mt_max)

cat("After QC:", ncol(merged), "cells,", nrow(merged), "genes\n")
## After QC: 51742 cells, 16653 genes
table(merged$sample)
## 
##   P1  P10   P2   P3   P4   P5   P6   P7   P8   P9 
## 8114 6378 6209 1896 6922 6483 4816 3945 4149 2830

Doublet Removal

Running scDblFinder per sample.

##############################
###   DOUBLET REMOVAL      ###
##############################

merged <- JoinLayers(merged)
sce <- as.SingleCellExperiment(merged)
sce <- scDblFinder(sce, samples = "sample")
##   |                                                                              |                                                                      |   0%  |                                                                              |=======                                                               |  10%  |                                                                              |==============                                                        |  20%  |                                                                              |=====================                                                 |  30%  |                                                                              |============================                                          |  40%  |                                                                              |===================================                                   |  50%  |                                                                              |==========================================                            |  60%  |                                                                              |=================================================                     |  70%  |                                                                              |========================================================              |  80%  |                                                                              |===============================================================       |  90%  |                                                                              |======================================================================| 100%
merged$doublet_class <- sce$scDblFinder.class
merged$doublet_score <- sce$scDblFinder.score

table(merged$doublet_class, merged$sample)
##          
##             P1  P10   P2   P3   P4   P5   P6   P7   P8   P9
##   singlet 7166 5853 5657 1808 6278 5902 4447 3686 3865 2676
##   doublet  948  525  552   88  644  581  369  259  284  154
merged <- subset(merged, doublet_class == "singlet")
cat("After doublet removal:", ncol(merged), "cells\n")
## After doublet removal: 47338 cells
rm(sce)
gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells  11909723  636.1   20222528 1080.0         NA  20222528 1080.0
## Vcells 214575754 1637.1  607698195 4636.4      16384 607698073 4636.4

~8% doublet rate across samples, about what you’d expect for standard 10x loading density.

Normalize and Integrate

Using LogNormalize + Harmony for batch correction across the 10 patient samples.

Tried SCTransform but hit the 16GB memory limit Lol, and the paper used standard normalization anyway so proceeding with that

##############################
###   NORMALIZATION &      ###
###   INTEGRATION          ###
##############################

# Saving QC checkpoint
saveRDS(merged, "data_processed/gse228598_merged_qc.rds")

# Standard log-normalization pipeline
merged <- NormalizeData(merged)
merged <- FindVariableFeatures(merged, nfeatures = 2000)
merged <- ScaleData(merged)
merged <- RunPCA(merged, verbose = FALSE)
# Elbow plot for PC selection
ElbowPlot(merged, ndims = 40)

Elbow flattens around PC 25-30, going with 30.

# Harmony batch correction
merged <- RunHarmony(merged, group.by.vars = "sample", reduction = "pca", reduction.save = "harmony")
merged <- RunUMAP(merged, reduction = "harmony", dims = 1:30, verbose = FALSE)
merged <- FindNeighbors(merged, reduction = "harmony", dims = 1:30, verbose = FALSE)
merged <- FindClusters(merged, resolution = cluster_res, verbose = FALSE)
# UMAP colored by cluster and by sample
p1 <- DimPlot(merged, reduction = "umap", group.by = "seurat_clusters", label = TRUE) + NoLegend()
p2 <- DimPlot(merged, reduction = "umap", group.by = "sample")
p1 + p2

ggsave("figures/umap_harmony_clusters.png", p1 + p2, width = 10, height = 4)

Samples mix well in the central mass, Harmony did its job like always, long live korsunsky lab. Isolated clusters on the right are patient-enriched, likely epithelial/mesothelial populations.

Broad Compartment Annotation

Assigning clusters to compartments using canonical marker expression.

##############################
###   COMPARTMENT          ###
###   ANNOTATION           ###
##############################

broad_markers <- c(
  "PTPRC",
  "CD3D", "CD3E", "NKG7",
  "CD79A", "MS4A1", "MZB1", "JCHAIN",
  "CD14", "CD68", "LYZ", "FCGR3A",
  "EPCAM", "KRT18", "KRT19",
  "PECAM1", "VWF",
  "COL1A1", "DCN",
  "MSLN", "CALB2"
)

DotPlot(merged, features = broad_markers, cluster.idents = TRUE) +
  RotatedAxis()

# DEGs per cluster to back up the compartment calls
markers <- FindAllMarkers(merged, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.5)
write.csv(markers, "tables/cluster_markers_all.csv", row.names = FALSE)
# Top 2 markers per cluster
top2 <- markers %>% group_by(cluster) %>% slice_max(avg_log2FC, n = 2)

DoHeatmap(merged, features = top2$gene, size = 3, cells = WhichCells(merged, downsample = 100)) +
  NoLegend() +
  theme(axis.text.y = element_text(size = 7))

ggsave("figures/heatmap_cluster_markers.png", width = 16, height = 8)

Clusters separate cleanly into the expected lineages. Myeloid clusters (0, 1, 3, 4, 8, 12, 14, 22) light up for LYZ/CD14/CD68. T/NK clusters have CD3D/NKG7. Epithelial clusters express keratins. Cluster 14 is a bit ambiguous – could be pDC or activated monocytes, but grouping with myeloid for now.

# Mapping clusters to broad compartments
compartment_map <- c(
  "0" = "Myeloid", "1" = "Myeloid", "3" = "Myeloid", "4" = "Myeloid",
  "8" = "Myeloid", "12" = "Myeloid", "14" = "Myeloid", "22" = "Myeloid",
  "2" = "T/NK", "5" = "T/NK", "6" = "T/NK", "7" = "T/NK",
  "9" = "T/NK", "17" = "T/NK", "18" = "T/NK", "19" = "T/NK", "21" = "T/NK",
  "11" = "B cell", "15" = "Plasma",
  "13" = "Epithelial", "16" = "Epithelial", "20" = "Epithelial",
  "10" = "Mesothelial", "23" = "Mesothelial"
)

merged$compartment <- unname(compartment_map[as.character(merged$seurat_clusters)])
table(merged$compartment)
## 
##      B cell  Epithelial Mesothelial     Myeloid      Plasma        T/NK 
##        1318        1562        1368       26921         275       15894
DimPlot(merged, group.by = "compartment", label = TRUE) + NoLegend()

ggsave("figures/umap_broad_compartments.png", width = 6, height = 5)

Myeloid dominant, T/NK second. Epithelial and mesothelial are mostly patient-specific as expected for peritoneal fluid.

# Exporting cell counts
write.csv(as.data.frame.matrix(table(merged$sample, merged$compartment)),
          "tables/broad_compartment_counts.csv")
write.csv(table(merged$sample, merged$compartment), "tables/cell_counts_by_sample.csv")

Macrophage Subset

Subsetting the myeloid compartment, re-clustering, and flagging M2 macrophages as CD68+CD163+ per Sullivan et al.

##############################
###   MACROPHAGE SUBSET    ###
##############################

macrophages <- subset(merged, compartment == "Myeloid")
macrophages <- NormalizeData(macrophages)
macrophages <- FindVariableFeatures(macrophages, nfeatures = 2000)
macrophages <- ScaleData(macrophages)
macrophages <- RunPCA(macrophages, verbose = FALSE)
macrophages <- RunHarmony(macrophages, group.by.vars = "sample", reduction = "pca", reduction.save = "harmony")
macrophages <- RunUMAP(macrophages, reduction = "harmony", dims = 1:20, verbose = FALSE)
macrophages <- FindNeighbors(macrophages, reduction = "harmony", dims = 1:20, verbose = FALSE)
# tried res 0.4 and 0.8 too, 0.6 gives cleanest separation
macrophages <- FindClusters(macrophages, resolution = 0.6, verbose = FALSE)

# Flagging CD68+CD163+ M2 macrophages
cd68_expr <- GetAssayData(macrophages, layer = "data")["CD68", ]
cd163_expr <- GetAssayData(macrophages, layer = "data")["CD163", ]
macrophages$m2_flag <- ifelse(cd68_expr > 0 & cd163_expr > 0, "CD68+CD163+", "Other")

table(macrophages$m2_flag)
## 
## CD68+CD163+       Other 
##       15148       11773
p1 <- DimPlot(macrophages, label = TRUE) + NoLegend() + ggtitle("Myeloid clusters")
p2 <- DimPlot(macrophages, group.by = "m2_flag") + ggtitle("CD68+CD163+ M2")
p1 + p2

Over half the myeloid cells are CD68+CD163+. They cluster together on the UMAP rather than scattering randomly, which suggests M2 polarization is a real transcriptional state here, not just noise from dropout.

Module Scores

Scoring all macrophages for the three Sullivan et al. signatures.

##############################
###    MODULE SCORES       ###
##############################

macrophages <- AddModuleScore(macrophages, features = list(m2ts_genes), name = "M2TS")
macrophages <- AddModuleScore(macrophages, features = list(m1ts_genes), name = "M1TS")
macrophages <- AddModuleScore(macrophages, features = list(ctlts_genes), name = "CTLTS")

saveRDS(macrophages, "data_processed/gse228598_macrophages.rds")
FeaturePlot(macrophages, features = c("M2TS1", "M1TS1", "CTLTS1"), ncol = 3, order = TRUE) &
  scale_color_viridis_c()

ggsave("figures/umap_m2ts_macrophage_subset.png", width = 10, height = 4)
VlnPlot(macrophages, features = "M2TS1", group.by = "m2_flag", pt.size = 0) +
  ggtitle("M2TS score by CD68+CD163+ status")

ggsave("figures/violin_m2ts_by_m2flag.png", width = 6, height = 4)
# Statistical test for M2TS enrichment
wt <- wilcox.test(M2TS1 ~ m2_flag, data = macrophages@meta.data)
cat("Wilcoxon rank-sum test: W =", wt$statistic, ", p =", format.pval(wt$p.value, digits = 3), "\n")
## Wilcoxon rank-sum test: W = 112193390 , p = <2e-16

M2TS tracks with the CD68+CD163+ phenotype as expected (Wilcoxon p < 2.2e-16). This replicates the paper’s core finding from their Figure 3A. M1TS and CTLTS are low in macrophages which makes sense since those are T-cell signatures.

Per-sample and Per-celltype Summaries

##############################
###    SUMMARY STATS       ###
##############################

m2_cells <- subset(macrophages, m2_flag == "CD68+CD163+")

m2ts_by_sample <- m2_cells@meta.data |>
  group_by(sample) |>
  summarise(
    n_m2_cells = n(),
    mean_m2ts = mean(M2TS1),
    median_m2ts = median(M2TS1),
    .groups = "drop"
  )

write.csv(m2ts_by_sample, "tables/m2ts_summary_by_sample.csv", row.names = FALSE)
m2ts_by_sample
## # A tibble: 10 × 4
##    sample n_m2_cells mean_m2ts median_m2ts
##    <chr>       <int>     <dbl>       <dbl>
##  1 P1           1147  -0.0762     -0.103  
##  2 P10          3334   0.0277      0.0328 
##  3 P2            490  -0.129      -0.138  
##  4 P3            541  -0.0862     -0.0903 
##  5 P4           2221   0.00603     0.00700
##  6 P5           1368  -0.0906     -0.101  
##  7 P6           1187  -0.0605     -0.0723 
##  8 P7           1490   0.164       0.173  
##  9 P8           2254   0.169       0.167  
## 10 P9           1116  -0.0234     -0.0275

M2TS varies a lot across patients. P7 and P8 are the highest, P2 is the lowest. The paper used this kind of inter-patient heterogeneity to stratify into M2TS-high vs M2TS-low groups.

# Scoring all cells for cross-compartment comparison
merged <- AddModuleScore(merged, features = list(m2ts_genes), name = "M2TS")
merged <- AddModuleScore(merged, features = list(m1ts_genes), name = "M1TS")
merged <- AddModuleScore(merged, features = list(ctlts_genes), name = "CTLTS")

m2ts_by_celltype <- merged@meta.data |>
  group_by(compartment) |>
  summarise(
    n_cells = n(),
    mean_m2ts = mean(M2TS1),
    mean_m1ts = mean(M1TS1),
    mean_ctlts = mean(CTLTS1),
    .groups = "drop"
  )

write.csv(m2ts_by_celltype, "tables/m2ts_summary_by_celltype.csv", row.names = FALSE)
m2ts_by_celltype
## # A tibble: 6 × 5
##   compartment n_cells mean_m2ts mean_m1ts mean_ctlts
##   <chr>         <int>     <dbl>     <dbl>      <dbl>
## 1 B cell         1318   -0.245     0.0814     -0.109
## 2 Epithelial     1562   -0.309    -0.0465     -0.198
## 3 Mesothelial    1368   -0.119    -0.0705     -0.260
## 4 Myeloid       26921    0.0959   -0.0144     -0.208
## 5 Plasma          275   -0.283    -0.0208      0.124
## 6 T/NK          15894   -0.219     0.0330      0.366

Signatures land where they should – M2TS highest in myeloid, CTLTS highest in T/NK. Sanity check passed.

Azimuth Reference Mapping

Using the Azimuth PBMC reference to validate marker-based annotation and get finer-grained immune subtypes. These labels get saved so Part 2 can use them for CellChat.

##############################
###   AZIMUTH MAPPING      ###
##############################

library(Azimuth)

immune <- subset(merged, compartment %in% c("Myeloid", "T/NK", "B cell", "Plasma"))
immune <- RunAzimuth(immune, reference = "pbmcref")

table(immune$predicted.celltype.l1)
## 
##       B   CD4 T   CD8 T      DC    Mono      NK   other other T 
##    1521    6254    4054    3133   27110     855     437    1044
table(immune$predicted.celltype.l2)
## 
##    B intermediate          B memory           B naive         CD14 Mono 
##               221               341               817             21587 
##         CD16 Mono           CD4 CTL         CD4 Naive CD4 Proliferating 
##              5365                84               107               305 
##           CD4 TCM           CD4 TEM         CD8 Naive CD8 Proliferating 
##              4994               564               305                43 
##           CD8 TCM           CD8 TEM              cDC1              cDC2 
##               339              3378               145              3025 
##               dnT               gdT              HSPC               ILC 
##               209               623                 1                56 
##              MAIT                NK  NK Proliferating     NK_CD56bright 
##               288               631                86               134 
##               pDC       Plasmablast          Platelet              Treg 
##                81               140               390               149

Azimuth breaks the myeloid compartment into mostly CD14 Mono and CD16 Mono with a smaller cDC population. T cells resolve into CD4 TCM, CD8 TEM, MAIT, gdT, Treg subsets – good granularity for CellChat.

# Cross-tabulating marker-based vs Azimuth labels
cross_tab <- table(
  marker = immune$compartment,
  azimuth = immune$predicted.celltype.l1
)
cross_tab
##          azimuth
## marker        B CD4 T CD8 T    DC  Mono    NK other other T
##   B cell   1289     4     6     2    13     2     1       1
##   Myeloid     3    91    11   409 26067     0   335       5
##   Plasma    177     7     2    79     7     1     2       0
##   T/NK       52  6152  4035  2643  1023   852    99    1038
write.csv(as.data.frame.matrix(cross_tab), "tables/marker_vs_azimuth_table.csv")

Strong concordance overall. Myeloid maps almost entirely to Mono, B cells map cleanly to B. The main discrepancy is a chunk of our T/NK cells that Azimuth calls DC – probably pDCs sitting at the boundary. Not worrying about it for now.

# Transferring Azimuth labels back to the full merged object
merged$azimuth_l1 <- NA
merged$azimuth_l2 <- NA
overlap <- intersect(Cells(immune), Cells(merged))
merged$azimuth_l1[match(overlap, Cells(merged))] <- immune$predicted.celltype.l1[match(overlap, Cells(immune))]
merged$azimuth_l2[match(overlap, Cells(merged))] <- immune$predicted.celltype.l2[match(overlap, Cells(immune))]

# Saving with all annotations (compartment + Azimuth) for Part 2
saveRDS(merged, "data_processed/gse228598_harmony.rds")